Download PDB stats data as CSV file from: http://www.rcsb.org/stats/summary
pdbstats <- read.csv("Data Export Summary.csv", row.names=1)
Lets look at table
library(knitr)
kable(pdbstats)
| Proteins | Nucleic.Acids | Protein.NA.Complex | Other | Total | |
|---|---|---|---|---|---|
| X-Ray | 122263 | 1960 | 6333 | 10 | 130566 |
| Other | 244 | 4 | 6 | 13 | 267 |
| NMR | 10898 | 1263 | 253 | 8 | 12422 |
| Multi Method | 119 | 5 | 2 | 1 | 127 |
| Electron Microscopy | 1822 | 31 | 657 | 0 | 2510 |
Q1. Determine the percentage of structures solved by X-Ray and Electron Microscopy. What proportion of structures are protein?
# total number of entries
nstru <- sum(pdbstats$Total)
precent <- round((pdbstats$Total / nstru)*100, 2)
A1. There are 89.49 % X-ray structures and 1.72 % EM structures in the PDB database as of 2018-11-06.
nstats <- pdbstats
nstats$Precent <- precent
kable(nstats)
| Proteins | Nucleic.Acids | Protein.NA.Complex | Other | Total | Precent | |
|---|---|---|---|---|---|---|
| X-Ray | 122263 | 1960 | 6333 | 10 | 130566 | 89.49 |
| Other | 244 | 4 | 6 | 13 | 267 | 0.18 |
| NMR | 10898 | 1263 | 253 | 8 | 12422 | 8.51 |
| Multi Method | 119 | 5 | 2 | 1 | 127 | 0.09 |
| Electron Microscopy | 1822 | 31 | 657 | 0 | 2510 | 1.72 |
The proportion of protein entries:
round((sum(nstats$Proteins) / nstru)*100,2)
## [1] 92.77
Any time we want to use a package we must load it with the library() function.
library(bio3d)
pdb <- read.pdb("1hsg")
## Note: Accessing on-line PDB file
pdb
##
## Call: read.pdb(file = "1hsg")
##
## Total Models#: 1
## Total Atoms#: 1686, XYZs#: 5058 Chains#: 2 (values: A B)
##
## Protein Atoms#: 1514 (residues/Calpha atoms#: 198)
## Nucleic acid Atoms#: 0 (residues/phosphate atoms#: 0)
##
## Non-protein/nucleic Atoms#: 172 (residues: 128)
## Non-protein/nucleic resid values: [ HOH (127), MK1 (1) ]
##
## Protein sequence:
## PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFIKVRQYD
## QILIEICGHKAIGTVLVGPTPVNIIGRNLLTQIGCTLNFPQITLWQRPLVTIKIGGQLKE
## ALLDTGADDTVLEEMSLPGRWKPKMIGGIGGFIKVRQYDQILIEICGHKAIGTVLVGPTP
## VNIIGRNLLTQIGCTLNF
##
## + attr: atom, xyz, seqres, helix, sheet,
## calpha, remark, call
Note that the attributes (+ attr:) of this object are listed on the last couple of lines. To find the attributes of any such object you can use:
attributes(pdb)
## $names
## [1] "atom" "xyz" "seqres" "helix" "sheet" "calpha" "remark" "call"
##
## $class
## [1] "pdb" "sse"
Ww can access each componet with the dollar syntax. For example:
head(pdb$atom)
## type eleno elety alt resid chain resno insert x y z o
## 1 ATOM 1 N <NA> PRO A 1 <NA> 29.361 39.686 5.862 1
## 2 ATOM 2 CA <NA> PRO A 1 <NA> 30.307 38.663 5.319 1
## 3 ATOM 3 C <NA> PRO A 1 <NA> 29.760 38.071 4.022 1
## 4 ATOM 4 O <NA> PRO A 1 <NA> 28.600 38.302 3.676 1
## 5 ATOM 5 CB <NA> PRO A 1 <NA> 30.508 37.541 6.342 1
## 6 ATOM 6 CG <NA> PRO A 1 <NA> 29.296 37.591 7.162 1
## b segid elesy charge
## 1 38.10 <NA> N <NA>
## 2 40.62 <NA> C <NA>
## 3 42.64 <NA> C <NA>
## 4 43.40 <NA> O <NA>
## 5 37.87 <NA> C <NA>
## 6 38.40 <NA> C <NA>
pdb$xyz
##
## Total Frames#: 1
## Total XYZs#: 5058, (Atoms#: 1686)
##
## [1] 29.361 39.686 5.862 <...> 30.112 17.912 -4.791 [5058]
##
## + attr: Matrix DIM = 1 x 5058
Plot of B-factor column
plot.bio3d(pdb$atom$b[pdb$calpha], sse=pdb, typ="l", ylab="B-factor")
We need to be able to get at select parts of protein structures (e.g. just the ligand, just the protein, or just chain A, etc.). For this we use the atom.select() function.
# Select all C-alpha atoms (return their indices)
ca.inds <- atom.select(pdb, "calpha")
ca.inds
##
## Call: atom.select.pdb(pdb = pdb, string = "calpha")
##
## Atom Indices#: 198 ($atom)
## XYZ Indices#: 594 ($xyz)
##
## + attr: atom, xyz, call
head( pdb$atom[ca.inds$atom,] )
## type eleno elety alt resid chain resno insert x y z o
## 2 ATOM 2 CA <NA> PRO A 1 <NA> 30.307 38.663 5.319 1
## 9 ATOM 9 CA <NA> GLN A 2 <NA> 30.158 36.492 2.199 1
## 18 ATOM 18 CA <NA> ILE A 3 <NA> 29.123 33.098 3.397 1
## 26 ATOM 26 CA <NA> THR A 4 <NA> 29.774 30.143 1.062 1
## 33 ATOM 33 CA <NA> LEU A 5 <NA> 27.644 27.003 1.144 1
## 41 ATOM 41 CA <NA> TRP A 6 <NA> 30.177 24.150 1.279 1
## b segid elesy charge
## 2 40.62 <NA> C <NA>
## 9 41.30 <NA> C <NA>
## 18 34.13 <NA> C <NA>
## 26 30.14 <NA> C <NA>
## 33 30.12 <NA> C <NA>
## 41 30.82 <NA> C <NA>
head(pdb$atom)
## type eleno elety alt resid chain resno insert x y z o
## 1 ATOM 1 N <NA> PRO A 1 <NA> 29.361 39.686 5.862 1
## 2 ATOM 2 CA <NA> PRO A 1 <NA> 30.307 38.663 5.319 1
## 3 ATOM 3 C <NA> PRO A 1 <NA> 29.760 38.071 4.022 1
## 4 ATOM 4 O <NA> PRO A 1 <NA> 28.600 38.302 3.676 1
## 5 ATOM 5 CB <NA> PRO A 1 <NA> 30.508 37.541 6.342 1
## 6 ATOM 6 CG <NA> PRO A 1 <NA> 29.296 37.591 7.162 1
## b segid elesy charge
## 1 38.10 <NA> N <NA>
## 2 40.62 <NA> C <NA>
## 3 42.64 <NA> C <NA>
## 4 43.40 <NA> O <NA>
## 5 37.87 <NA> C <NA>
## 6 38.40 <NA> C <NA>
ligand.inds <- atom.select(pdb, "ligand")
ligand.inds
##
## Call: atom.select.pdb(pdb = pdb, string = "ligand")
##
## Atom Indices#: 45 ($atom)
## XYZ Indices#: 135 ($xyz)
##
## + attr: atom, xyz, call
head( pdb$atom[ligand.inds$atom,] )
## type eleno elety alt resid chain resno insert x y z
## 1515 HETATM 1517 N1 <NA> MK1 B 902 <NA> 9.280 23.763 3.004
## 1516 HETATM 1518 C1 <NA> MK1 B 902 <NA> 9.498 23.983 4.459
## 1517 HETATM 1519 C2 <NA> MK1 B 902 <NA> 10.591 24.905 4.962
## 1518 HETATM 1520 C3 <NA> MK1 B 902 <NA> 10.591 24.864 6.466
## 1519 HETATM 1521 O1 <NA> MK1 B 902 <NA> 10.937 23.849 7.057
## 1520 HETATM 1522 N2 <NA> MK1 B 902 <NA> 10.193 25.953 7.094
## o b segid elesy charge
## 1515 1 28.25 <NA> N <NA>
## 1516 1 30.30 <NA> C <NA>
## 1517 1 27.27 <NA> C <NA>
## 1518 1 28.85 <NA> C <NA>
## 1519 1 29.59 <NA> O <NA>
## 1520 1 22.29 <NA> N <NA>
ligand.pdb <- atom.select(pdb, "ligand", value=TRUE)
write.pdb(ligand.pdb, file="MK1.pdb")
Here is a snapshot figure of HIV-1 Protease (PDB CODE: 1HSG) made with VMD.
HIV protease structure 1HSG rendered by VMD
# Download some example PDB files
ids <- c("1TND_B","1AGR_A","1TAG_A","1GG2_A","1KJY_A","4G5Q_A")
files <- get.pdb(ids, split = TRUE)
## Warning in get.pdb(ids, split = TRUE): ./1TND.pdb exists. Skipping download
## Warning in get.pdb(ids, split = TRUE): ./1AGR.pdb exists. Skipping download
## Warning in get.pdb(ids, split = TRUE): ./1TAG.pdb exists. Skipping download
## Warning in get.pdb(ids, split = TRUE): ./1GG2.pdb exists. Skipping download
## Warning in get.pdb(ids, split = TRUE): ./1KJY.pdb exists. Skipping download
## Warning in get.pdb(ids, split = TRUE): ./4G5Q.pdb exists. Skipping download
##
|
| | 0%
|
|=========== | 17%
|
|====================== | 33%
|
|================================ | 50%
|
|=========================================== | 67%
|
|====================================================== | 83%
|
|=================================================================| 100%
Align these files
# Extract and align the chains we are interested in
pdbs <- pdbaln(files, fit = TRUE)
## Reading PDB files:
## ./split_chain/1TND_B.pdb
## ./split_chain/1AGR_A.pdb
## ./split_chain/1TAG_A.pdb
## ./split_chain/1GG2_A.pdb
## ./split_chain/1KJY_A.pdb
## ./split_chain/4G5Q_A.pdb
## ..... PDB has ALT records, taking A only, rm.alt=TRUE
## .
##
## Extracting sequences
##
## pdb/seq: 1 name: ./split_chain/1TND_B.pdb
## pdb/seq: 2 name: ./split_chain/1AGR_A.pdb
## pdb/seq: 3 name: ./split_chain/1TAG_A.pdb
## pdb/seq: 4 name: ./split_chain/1GG2_A.pdb
## pdb/seq: 5 name: ./split_chain/1KJY_A.pdb
## pdb/seq: 6 name: ./split_chain/4G5Q_A.pdb
## PDB has ALT records, taking A only, rm.alt=TRUE
pdbs
## 1 . . . . 50
## [Truncated_Name:1]1TND_B.pdb --------------------------ARTVKLLLLGAGESGKSTIVKQMK
## [Truncated_Name:2]1AGR_A.pdb LSAEDKAAVERSKMIDRNLREDGEKAAREVKLLLLGAGESGKSTIVKQMK
## [Truncated_Name:3]1TAG_A.pdb --------------------------ARTVKLLLLGAGESGKSTIVKQMK
## [Truncated_Name:4]1GG2_A.pdb LSAEDKAAVERSKMIDRNLREDGEKAAREVKLLLLGAGESGKSTIVKQMK
## [Truncated_Name:5]1KJY_A.pdb -------------------------GAREVKLLLLGAGESGKSTIVKQMK
## [Truncated_Name:6]4G5Q_A.pdb --------------------------AREVKLLLLGAGESGKSTIVKQMK
## ** *********************
## 1 . . . . 50
##
## 51 . . . . 100
## [Truncated_Name:1]1TND_B.pdb IIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDA
## [Truncated_Name:2]1AGR_A.pdb IIHEAGYSEEECKQYKAVVYSNTIQSIIAIIRAMGRLKIDFGDAARADDA
## [Truncated_Name:3]1TAG_A.pdb IIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDA
## [Truncated_Name:4]1GG2_A.pdb IIHEAGYSEEECKQYKAVVYSNTIQSIIAIIRAMGRLKIDFGDAARADDA
## [Truncated_Name:5]1KJY_A.pdb IIHEAGYSEEECKQYKAVVYSNTIQSIIAIIRAMGRLKIDFGDSARADDA
## [Truncated_Name:6]4G5Q_A.pdb IIHEAGYSEEECKQYKAVVYSNTIQSIIAIIRAMGRLKIDFGDSARADDA
## *** *** *** ^ *^^* **^***^**^*** * * ^** ** ***
## 51 . . . . 100
##
## 101 . . . . 150
## [Truncated_Name:1]1TND_B.pdb RKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFDRASEYQLNDSAGY
## [Truncated_Name:2]1AGR_A.pdb RQLFVLAGAAEEGFMTAELAGVIKRLWKDSGVQACFNRSREYQLNDSAAY
## [Truncated_Name:3]1TAG_A.pdb RKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFDRASEYQLNDSAGY
## [Truncated_Name:4]1GG2_A.pdb RQLFVLAGAAEEGFMTAELAGVIKRLWKDSGVQACFNRSREYQLNDSAAY
## [Truncated_Name:5]1KJY_A.pdb RQLFVLAGAAEEGFMTAELAGVIKRLWKDSGVQACFNRSREYQLNDSAAY
## [Truncated_Name:6]4G5Q_A.pdb RQLFVLAGAAEEGFMTAELAGVIKRLWKDSGVQACFNRSREYQLNDSAAY
## * * ^* *** * *^ ^* *******^**** * ********^*
## 101 . . . . 150
##
## 151 . . . . 200
## [Truncated_Name:1]1TND_B.pdb YLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQ
## [Truncated_Name:2]1AGR_A.pdb YLNDLDRIAQPNYIPTQQDVLRTRVKTTGIVETHFTFKDLHFKMFDVGGQ
## [Truncated_Name:3]1TAG_A.pdb YLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQ
## [Truncated_Name:4]1GG2_A.pdb YLNDLDRIAQPNYIPTQQDVLRTRVKTTGIVETHFTFKDLHFKMFDVGAQ
## [Truncated_Name:5]1KJY_A.pdb YLNDLDRIAQPNYIPTQQDVLRTRVKTTGIVETHFTFKDLHFKMFDVGGQ
## [Truncated_Name:6]4G5Q_A.pdb YLNDLDRIAQPNYIPTQQDVLRTRVKTTGIVETHFTFKDLHFKMFDVGGQ
## ** **^*^ * *^** *****^*******^** *^**** *^*****^*
## 151 . . . . 200
##
## 201 . . . . 250
## [Truncated_Name:1]1TND_B.pdb RSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSIC
## [Truncated_Name:2]1AGR_A.pdb RSERKKWIHCFEGVTAIIFCVALSDYDLVLAEDEEMNRMHESMKLFDSIC
## [Truncated_Name:3]1TAG_A.pdb RSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSIC
## [Truncated_Name:4]1GG2_A.pdb RSERKKWIHCFEGVTAIIFCVALSDYDLVLAEDEEMNRMHESMKLFDSIC
## [Truncated_Name:5]1KJY_A.pdb RSERKKWIHCFEGVTAIIFCVALSDYDLVLAEDEEMNRMHESMKLFDSIC
## [Truncated_Name:6]4G5Q_A.pdb RSERKKWIHCFEGVTAIIFCVALSDYDLVLAEDEEMNRMHESMKLFDSIC
## *************** *** *** **^** **^*^******^^** ***
## 201 . . . . 250
##
## 251 . . . . 300
## [Truncated_Name:1]1TND_B.pdb NHRYFATTSIVLFLNKKDVFSEKIKKAHLSICFPDYNGPNTYEDAGNYIK
## [Truncated_Name:2]1AGR_A.pdb NNKWFTDTSIILFLNKKDLFEEKIKKSPLTICYPEYAGSNTYEEAAAYIQ
## [Truncated_Name:3]1TAG_A.pdb NHRYFATTSIVLFLNKKDVFSEKIKKAHLSICFPDYNGPNTYEDAGNYIK
## [Truncated_Name:4]1GG2_A.pdb NNKWFTDTSIILFLNKKDLFEEKIKKSPLTICYPEYAGSNTYEEAAAYIQ
## [Truncated_Name:5]1KJY_A.pdb NNKWFTDTSIILFLNKKDLFEEKIKKSPLTICYPEYAGSNTYEEAAAYIQ
## [Truncated_Name:6]4G5Q_A.pdb NNKWFTDTSIILFLNKKDLFEEKIKKSPLTICYPEYAGSNTYEEAAAYIQ
## * ^^* ***^*******^* ***** *^**^*^* * ****^*^ **
## 251 . . . . 300
##
## 301 . . . . 350
## [Truncated_Name:1]1TND_B.pdb VQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKE--------
## [Truncated_Name:2]1AGR_A.pdb CQFEDLNKRKDTKEIYTHFTCATDTKNVQFVFDAVTDVIIKNNLKDCGLF
## [Truncated_Name:3]1TAG_A.pdb VQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIII----------
## [Truncated_Name:4]1GG2_A.pdb CQFEDLNKRKDTKEIYTHFTCATDTKNVQFVFDAVTDVIIKNNL------
## [Truncated_Name:5]1KJY_A.pdb CQFEDLNKRKDTKEIYTHFTCATDTKNVQFVFDAVTDVIIKNNLK-----
## [Truncated_Name:6]4G5Q_A.pdb CQFEDLNKRKDTKEIYTHFTCATDTKNVQFVFDAVTDVIIKNNLKD----
## ** ^** *^* ****^* ****** ** ********^**
## 301 . . . . 350
##
## Call:
## pdbaln(files = files, fit = TRUE)
##
## Class:
## pdbs, fasta
##
## Alignment dimensions:
## 6 sequence rows; 350 position columns (314 non-gap, 36 gap)
##
## + attr: xyz, resno, b, chain, id, ali, resid, sse, call
View of these fit (i.e. superposed) structures
library(bio3d.view)
library(rgl)
view(pdbs)
rglwidget(width=500, height=500)
Lets try PCA of these structures
# Perform PCA
pc.xray <- pca(pdbs)
Look at results
plot(pc.xray)
# Visualize first principal component
pc1 <- mktrj(pc.xray, pc=1, file="pc_1.pdb")
View structures along PC1
view(pc1)
## Potential all C-alpha atom structure(s) detected: Using calpha.connectivity()
rglwidget(width=500, height=500)